In [1]:
conda install pandas
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 4.10.3
  latest version: 23.1.0

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
In [2]:
import pandas as pd
In [3]:
import os
In [4]:
path = os.path.join("data","BA_reviews.csv")
if os.path.isfile(path):
    data = pd.read_csv(path)
else:
    print(f"file {path} does not exist")
In [5]:
data = pd.read_csv("data/BA_reviews.csv")
In [6]:
print(data)
     Unnamed: 0                                            reviews
0             0  ✅ Trip Verified | Excellent service both on th...
1             1  ✅ Trip Verified |  Good lounge at Cape Town. O...
2             2  ✅ Trip Verified |  A really excellent journey....
3             3  ✅ Trip Verified |  This flight was one of the ...
4             4  Not Verified | It seems that there is a race t...
..          ...                                                ...
995         995  ✅ Trip Verified | Flew British Airways from Lo...
996         996  ✅ Trip Verified |  Madrid to London. The main ...
997         997  ✅ Trip Verified |  London to Moscow. British A...
998         998  ✅ Trip Verified |  Miami to London. My most re...
999         999  ✅ Trip Verified |  Gatwick to Barbados in Dece...

[1000 rows x 2 columns]
In [7]:
data.describe(include='all')
Out[7]:
Unnamed: 0 reviews
count 1000.000000 1000
unique NaN 1000
top NaN ✅ Trip Verified | Excellent service both on th...
freq NaN 1
mean 499.500000 NaN
std 288.819436 NaN
min 0.000000 NaN
25% 249.750000 NaN
50% 499.500000 NaN
75% 749.250000 NaN
max 999.000000 NaN
In [8]:
data[data['reviews'].str.contains("Thailand")]
Out[8]:
Unnamed: 0 reviews
777 777 ✅ Trip Verified | I had flown British Airways ...
841 841 ✅ Trip Verified | London to Bangkok. Flew Bri...
In [9]:
data.rename(columns={'Unnamed':'trip_verification'}, inplace=True)
In [10]:
data = pd.read_csv("data/BA_reviews.csv")
data.head()  
Out[10]:
Unnamed: 0 reviews
0 0 ✅ Trip Verified | Excellent service both on th...
1 1 ✅ Trip Verified | Good lounge at Cape Town. O...
2 2 ✅ Trip Verified | A really excellent journey....
3 3 ✅ Trip Verified | This flight was one of the ...
4 4 Not Verified | It seems that there is a race t...
In [11]:
print(data)
     Unnamed: 0                                            reviews
0             0  ✅ Trip Verified | Excellent service both on th...
1             1  ✅ Trip Verified |  Good lounge at Cape Town. O...
2             2  ✅ Trip Verified |  A really excellent journey....
3             3  ✅ Trip Verified |  This flight was one of the ...
4             4  Not Verified | It seems that there is a race t...
..          ...                                                ...
995         995  ✅ Trip Verified | Flew British Airways from Lo...
996         996  ✅ Trip Verified |  Madrid to London. The main ...
997         997  ✅ Trip Verified |  London to Moscow. British A...
998         998  ✅ Trip Verified |  Miami to London. My most re...
999         999  ✅ Trip Verified |  Gatwick to Barbados in Dece...

[1000 rows x 2 columns]
In [12]:
pip install country_list
Requirement already satisfied: country_list in ./opt/anaconda3/lib/python3.9/site-packages (1.0.0)
Note: you may need to restart the kernel to use updated packages.
In [13]:
from country_list import available_languages, countries_for_language

for language in available_languages():
    print(language)
    break
af
In [14]:
countries = dict(countries_for_language('en'))
print(countries['TH'])
Thailand
In [15]:
# Get a list of country names for a given language (in this case, English)
country_list = [country[1] for country in countries_for_language('en')]

# Create a function to extract country names
def extract_country(reviews):
    # Loop through the list of countries and check if the review contains a country name
    for country in country_list:
        if country in reviews:
            return country
    return None


data['countries'] = data['reviews'].apply(extract_country)

# Show the updated data frame
print(data)
     Unnamed: 0                                            reviews countries
0             0  ✅ Trip Verified | Excellent service both on th...      None
1             1  ✅ Trip Verified |  Good lounge at Cape Town. O...      None
2             2  ✅ Trip Verified |  A really excellent journey....      None
3             3  ✅ Trip Verified |  This flight was one of the ...      None
4             4  Not Verified | It seems that there is a race t...      None
..          ...                                                ...       ...
995         995  ✅ Trip Verified | Flew British Airways from Lo...      None
996         996  ✅ Trip Verified |  Madrid to London. The main ...      None
997         997  ✅ Trip Verified |  London to Moscow. British A...      None
998         998  ✅ Trip Verified |  Miami to London. My most re...      None
999         999  ✅ Trip Verified |  Gatwick to Barbados in Dece...  Barbados

[1000 rows x 3 columns]
In [16]:
print(data.tail(5))
     Unnamed: 0                                            reviews countries
995         995  ✅ Trip Verified | Flew British Airways from Lo...      None
996         996  ✅ Trip Verified |  Madrid to London. The main ...      None
997         997  ✅ Trip Verified |  London to Moscow. British A...      None
998         998  ✅ Trip Verified |  Miami to London. My most re...      None
999         999  ✅ Trip Verified |  Gatwick to Barbados in Dece...  Barbados
In [17]:
data[data['reviews'].str.contains("London")]
Out[17]:
Unnamed: 0 reviews countries
7 7 ✅ Trip Verified | Easy check in and staff mem... None
20 20 ✅ Trip Verified | Absolutely terrible experie... None
21 21 ✅ Trip Verified | Vancouver to Delhi via Lond... None
25 25 Not Verified | BA cancelled my flight home, t... None
26 26 ✅ Trip Verified | Turned up 3.5 hours in advan... None
... ... ... ...
994 994 ✅ Trip Verified | Worst BA flight ever! Flew T... None
995 995 ✅ Trip Verified | Flew British Airways from Lo... None
996 996 ✅ Trip Verified | Madrid to London. The main ... None
997 997 ✅ Trip Verified | London to Moscow. British A... None
998 998 ✅ Trip Verified | Miami to London. My most re... None

558 rows × 3 columns

In [18]:
data["trip verification"] = data.reviews.str[:15]
data.tail()
Out[18]:
Unnamed: 0 reviews countries trip verification
995 995 ✅ Trip Verified | Flew British Airways from Lo... None ✅ Trip Verified
996 996 ✅ Trip Verified | Madrid to London. The main ... None ✅ Trip Verified
997 997 ✅ Trip Verified | London to Moscow. British A... None ✅ Trip Verified
998 998 ✅ Trip Verified | Miami to London. My most re... None ✅ Trip Verified
999 999 ✅ Trip Verified | Gatwick to Barbados in Dece... Barbados ✅ Trip Verified
In [19]:
data['reviews'] = data['reviews'].str.replace('✅ Trip Verified', '')
In [20]:
data.tail()
Out[20]:
Unnamed: 0 reviews countries trip verification
995 995 | Flew British Airways from London Heathrow t... None ✅ Trip Verified
996 996 | Madrid to London. The main plus about this... None ✅ Trip Verified
997 997 | London to Moscow. British Airways has down... None ✅ Trip Verified
998 998 | Miami to London. My most recent BA experie... None ✅ Trip Verified
999 999 | Gatwick to Barbados in December 2017. On a... Barbados ✅ Trip Verified
In [21]:
data['reviews'] = data['reviews'].str.replace('Not Verified', '')
In [22]:
data.tail()
Out[22]:
Unnamed: 0 reviews countries trip verification
995 995 | Flew British Airways from London Heathrow t... None ✅ Trip Verified
996 996 | Madrid to London. The main plus about this... None ✅ Trip Verified
997 997 | London to Moscow. British Airways has down... None ✅ Trip Verified
998 998 | Miami to London. My most recent BA experie... None ✅ Trip Verified
999 999 | Gatwick to Barbados in December 2017. On a... Barbados ✅ Trip Verified
In [23]:
data.to_csv("modified_data.csv", index=False)




#questions for data: mention all of the times the British airlines is mentioned
# Count up all of the positive reviews with words such as good great excellent    
In [24]:
print(data)
     Unnamed: 0                                            reviews countries  \
0             0   | Excellent service both on the ground and on...      None   
1             1   |  Good lounge at Cape Town. On time departur...      None   
2             2   |  A really excellent journey. Lounge not ove...      None   
3             3   |  This flight was one of the worst I have ev...      None   
4             4   | It seems that there is a race to the bottom...      None   
..          ...                                                ...       ...   
995         995   | Flew British Airways from London Heathrow t...      None   
996         996   |  Madrid to London. The main plus about this...      None   
997         997   |  London to Moscow. British Airways has down...      None   
998         998   |  Miami to London. My most recent BA experie...      None   
999         999   |  Gatwick to Barbados in December 2017. On a...  Barbados   

    trip verification  
0     ✅ Trip Verified  
1     ✅ Trip Verified  
2     ✅ Trip Verified  
3     ✅ Trip Verified  
4     Not Verified |   
..                ...  
995   ✅ Trip Verified  
996   ✅ Trip Verified  
997   ✅ Trip Verified  
998   ✅ Trip Verified  
999   ✅ Trip Verified  

[1000 rows x 4 columns]
In [25]:
pip install gensim
Requirement already satisfied: gensim in ./opt/anaconda3/lib/python3.9/site-packages (4.3.0)
Requirement already satisfied: scipy>=1.7.0 in ./opt/anaconda3/lib/python3.9/site-packages (from gensim) (1.7.1)
Requirement already satisfied: numpy>=1.18.5 in ./opt/anaconda3/lib/python3.9/site-packages (from gensim) (1.20.3)
Requirement already satisfied: FuzzyTM>=0.4.0 in ./opt/anaconda3/lib/python3.9/site-packages (from gensim) (2.0.5)
Requirement already satisfied: smart-open>=1.8.1 in ./opt/anaconda3/lib/python3.9/site-packages (from gensim) (6.3.0)
Requirement already satisfied: pandas in ./opt/anaconda3/lib/python3.9/site-packages (from FuzzyTM>=0.4.0->gensim) (1.3.4)
Requirement already satisfied: pyfume in ./opt/anaconda3/lib/python3.9/site-packages (from FuzzyTM>=0.4.0->gensim) (0.2.25)
Requirement already satisfied: python-dateutil>=2.7.3 in ./opt/anaconda3/lib/python3.9/site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in ./opt/anaconda3/lib/python3.9/site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2021.3)
Requirement already satisfied: six>=1.5 in ./opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas->FuzzyTM>=0.4.0->gensim) (1.16.0)
Requirement already satisfied: simpful in ./opt/anaconda3/lib/python3.9/site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (2.9.0)
Requirement already satisfied: fst-pso in ./opt/anaconda3/lib/python3.9/site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (1.8.1)
Requirement already satisfied: miniful in ./opt/anaconda3/lib/python3.9/site-packages (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim) (0.0.6)
Requirement already satisfied: requests in ./opt/anaconda3/lib/python3.9/site-packages (from simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.26.0)
Requirement already satisfied: idna<4,>=2.5 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (3.2)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (1.26.7)
Requirement already satisfied: charset-normalizer~=2.0.0 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2021.10.8)
Note: you may need to restart the kernel to use updated packages.
In [26]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
In [27]:
# Define a function to preprocess the review data
def preprocess(reviews):
    return [word for word in simple_preprocess(reviews) if word not in STOPWORDS]

# Preprocess the reviews data
processed_reviews = data['reviews'].map(preprocess)

# Create a dictionary from the processed reviews data
dictionary = corpora.Dictionary(processed_reviews)

# Create a bag-of-words representation of the processed reviews data
bow_corpus = [dictionary.doc2bow(reviews) for reviews in processed_reviews]

# Train the topic modeling algorithm using the bag-of-words corpus
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

# Print the top 10 keywords for each of the 10 topics generated
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
Topic: 0 
Words: 0.036*"flight" + 0.021*"ba" + 0.009*"staff" + 0.009*"service" + 0.009*"london" + 0.008*"seat" + 0.008*"check" + 0.007*"food" + 0.007*"time" + 0.006*"plane"
Topic: 1 
Words: 0.024*"flight" + 0.012*"seats" + 0.012*"seat" + 0.011*"ba" + 0.010*"class" + 0.009*"london" + 0.009*"good" + 0.008*"business" + 0.008*"service" + 0.007*"great"
Topic: 2 
Words: 0.024*"flight" + 0.012*"ba" + 0.012*"airways" + 0.011*"london" + 0.011*"british" + 0.009*"good" + 0.009*"time" + 0.008*"service" + 0.008*"food" + 0.008*"airline"
Topic: 3 
Words: 0.022*"flight" + 0.016*"ba" + 0.010*"london" + 0.007*"service" + 0.007*"airport" + 0.006*"customer" + 0.006*"told" + 0.005*"asked" + 0.005*"said" + 0.005*"british"
Topic: 4 
Words: 0.022*"flight" + 0.016*"ba" + 0.012*"cabin" + 0.010*"service" + 0.010*"crew" + 0.010*"seats" + 0.010*"london" + 0.009*"food" + 0.009*"seat" + 0.008*"business"
Topic: 5 
Words: 0.023*"flight" + 0.016*"service" + 0.012*"ba" + 0.010*"london" + 0.009*"class" + 0.008*"business" + 0.007*"time" + 0.007*"food" + 0.007*"check" + 0.007*"crew"
Topic: 6 
Words: 0.032*"flight" + 0.017*"ba" + 0.013*"service" + 0.009*"london" + 0.008*"seat" + 0.007*"staff" + 0.007*"time" + 0.007*"crew" + 0.007*"hours" + 0.006*"aircraft"
Topic: 7 
Words: 0.019*"ba" + 0.015*"flight" + 0.015*"class" + 0.012*"business" + 0.012*"seat" + 0.012*"food" + 0.011*"good" + 0.009*"service" + 0.009*"club" + 0.007*"cabin"
Topic: 8 
Words: 0.018*"ba" + 0.016*"flight" + 0.012*"crew" + 0.010*"london" + 0.010*"service" + 0.009*"time" + 0.009*"heathrow" + 0.008*"cabin" + 0.007*"food" + 0.007*"good"
Topic: 9 
Words: 0.018*"flight" + 0.016*"ba" + 0.010*"london" + 0.009*"seats" + 0.007*"service" + 0.007*"food" + 0.006*"heathrow" + 0.006*"crew" + 0.005*"class" + 0.005*"time"
In [28]:
pip install wordcloud
Requirement already satisfied: wordcloud in ./opt/anaconda3/lib/python3.9/site-packages (1.8.2.2)
Requirement already satisfied: numpy>=1.6.1 in ./opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (1.20.3)
Requirement already satisfied: pillow in ./opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (8.4.0)
Requirement already satisfied: matplotlib in ./opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (3.4.3)
Requirement already satisfied: cycler>=0.10 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (0.10.0)
Requirement already satisfied: python-dateutil>=2.7 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (1.3.1)
Requirement already satisfied: pyparsing>=2.2.1 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (3.0.4)
Requirement already satisfied: six in ./opt/anaconda3/lib/python3.9/site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [29]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

reviews = data['reviews']
reviews_text = " ".join(review for review in reviews)
wordcloud = WordCloud(width=800, height=800, min_font_size=10).generate(reviews_text)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()
In [30]:
lda_model.save('model_file.lda')
In [31]:
print(data)
     Unnamed: 0                                            reviews countries  \
0             0   | Excellent service both on the ground and on...      None   
1             1   |  Good lounge at Cape Town. On time departur...      None   
2             2   |  A really excellent journey. Lounge not ove...      None   
3             3   |  This flight was one of the worst I have ev...      None   
4             4   | It seems that there is a race to the bottom...      None   
..          ...                                                ...       ...   
995         995   | Flew British Airways from London Heathrow t...      None   
996         996   |  Madrid to London. The main plus about this...      None   
997         997   |  London to Moscow. British Airways has down...      None   
998         998   |  Miami to London. My most recent BA experie...      None   
999         999   |  Gatwick to Barbados in December 2017. On a...  Barbados   

    trip verification  
0     ✅ Trip Verified  
1     ✅ Trip Verified  
2     ✅ Trip Verified  
3     ✅ Trip Verified  
4     Not Verified |   
..                ...  
995   ✅ Trip Verified  
996   ✅ Trip Verified  
997   ✅ Trip Verified  
998   ✅ Trip Verified  
999   ✅ Trip Verified  

[1000 rows x 4 columns]
In [32]:
from collections import Counter
import pandas as pd

# Assuming that 'data' is a DataFrame with a column named 'text' containing text data
words = Counter(" ".join(data['reviews']).split())
most_common_words = words.most_common(10)  # Returns the 10 most common words
print(most_common_words)
[('the', 6210), ('to', 5110), ('and', 4388), ('was', 3380), ('a', 3151), ('I', 2993), ('of', 1985), ('in', 1865), ('on', 1667), ('for', 1594)]
In [ ]:
 
In [33]:
# Assuming 'model' is the trained topic model
num_words = 10  # Number of most common words to print for each topic
topics = lda_model.show_topics(num_topics=-1, num_words=num_words, formatted=False)

for topic in topics:
    print(f"Topic {topic[0]}: ")
    for word, weight in topic[1]:
        print(f"\t{word} ({weight:.2f})")
Topic 0: 
	flight (0.04)
	ba (0.02)
	staff (0.01)
	service (0.01)
	london (0.01)
	seat (0.01)
	check (0.01)
	food (0.01)
	time (0.01)
	plane (0.01)
Topic 1: 
	flight (0.02)
	seats (0.01)
	seat (0.01)
	ba (0.01)
	class (0.01)
	london (0.01)
	good (0.01)
	business (0.01)
	service (0.01)
	great (0.01)
Topic 2: 
	flight (0.02)
	ba (0.01)
	airways (0.01)
	london (0.01)
	british (0.01)
	good (0.01)
	time (0.01)
	service (0.01)
	food (0.01)
	airline (0.01)
Topic 3: 
	flight (0.02)
	ba (0.02)
	london (0.01)
	service (0.01)
	airport (0.01)
	customer (0.01)
	told (0.01)
	asked (0.01)
	said (0.00)
	british (0.00)
Topic 4: 
	flight (0.02)
	ba (0.02)
	cabin (0.01)
	service (0.01)
	crew (0.01)
	seats (0.01)
	london (0.01)
	food (0.01)
	seat (0.01)
	business (0.01)
Topic 5: 
	flight (0.02)
	service (0.02)
	ba (0.01)
	london (0.01)
	class (0.01)
	business (0.01)
	time (0.01)
	food (0.01)
	check (0.01)
	crew (0.01)
Topic 6: 
	flight (0.03)
	ba (0.02)
	service (0.01)
	london (0.01)
	seat (0.01)
	staff (0.01)
	time (0.01)
	crew (0.01)
	hours (0.01)
	aircraft (0.01)
Topic 7: 
	ba (0.02)
	flight (0.02)
	class (0.02)
	business (0.01)
	seat (0.01)
	food (0.01)
	good (0.01)
	service (0.01)
	club (0.01)
	cabin (0.01)
Topic 8: 
	ba (0.02)
	flight (0.02)
	crew (0.01)
	london (0.01)
	service (0.01)
	time (0.01)
	heathrow (0.01)
	cabin (0.01)
	food (0.01)
	good (0.01)
Topic 9: 
	flight (0.02)
	ba (0.02)
	london (0.01)
	seats (0.01)
	service (0.01)
	food (0.01)
	heathrow (0.01)
	crew (0.01)
	class (0.01)
	time (0.01)
In [34]:
# Assuming 'model' is the trained topic model
num_words = 10  # Number of most common words to print for each topic
topics = lda_model.show_topics(num_topics=-1, num_words=num_words, formatted=False)

fig, ax = plt.subplots(figsize=(10, 8))

for topic in topics:
    words = [word for word, weight in topic[1]]
    weights = [weight for word, weight in topic[1]]
    ax.bar(words, weights, alpha=0.8, label=f"Topic {topic[0]}")

ax.set_xlabel("Word")
ax.set_ylabel("Weight")
ax.set_title("Top Words by Topic")
ax.legend()
plt.xticks(rotation=45)
plt.show()
In [35]:
# Extract the value counts for the "trip verification" column
counts = data["trip verification"].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))

ax.bar(counts.index, counts.values, alpha=0.8)

ax.set_xlabel("Verification Status")
ax.set_ylabel("Count")
ax.set_title("Trip Verification Counts")
plt.show()
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:240: RuntimeWarning: Glyph 9989 missing from current font.
  font.set_text(s, 0.0, flags=flags)
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:240: RuntimeWarning: Glyph 10062 missing from current font.
  font.set_text(s, 0.0, flags=flags)
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:203: RuntimeWarning: Glyph 9989 missing from current font.
  font.set_text(s, 0, flags=flags)
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:203: RuntimeWarning: Glyph 10062 missing from current font.
  font.set_text(s, 0, flags=flags)
In [36]:
#if the trip is verfied is is more likley to be a negative review 
In [37]:
negative_words = ["bad", "not good"]
negative_reviews = data.loc[(data["trip verification"] != "trip verified") & (data["reviews"].str.contains("|".join(negative_words))), "reviews"]
for review in negative_reviews:
    print(review)
    break
 |  Vancouver to Delhi via London. We were booked to fly from Vancouver to New Delhi via London Heathrow on Dec 22nd. We received an email on Dec 20th informing us about the industrial action in the UK. I called to find out how it may impact our travel. The representative kind of scared us of being possibly stranded during our travel and offered us a full refund. We called again to confirm about other options but he did not offer any other solutions. We asked to cancel our tickets and he told us about the cancellation fee. We booked new tickets with another airline with double the cost as our trip was urgent (simultaneously). He told us he no longer can cancel our tickets on the system but promised us he will make it happen because he told us we would get a full refund. He processed the cancellation. 5 hours later he emailed us that the tickets couldn’t be cancelled and suggested we cancel our new tickets once he found out they were with another airline (we lost $1000 from cancelling the new tickets). Now, the representative claimed that he never said there won’t be any services available and asked us why we were worrying about the industrial action, and that other “arrangements” will be made if there were any issues (this was new information never given to us before). We kept our British Airway tickets. The next day, we asked to speak with the manager and report that we were misguided about the whole situation. We never heard back from the manager. As we logged in to check-in 15 hours before take off, we found that our VAN — LHR flight was cancelled (due to the bad weather in Vancouver). We did not receive any emails from the airport nor the airline about this change. We called BA again and asked about rearrangements for our flight, to which they offered rebooking with an additional $700-1000 per person price difference and the dates offered were about 7 days after our original flight was supposed to leave. Additionally, to cancel the rest of the flights with them which we could not reach since our first flight was cancelled, we had to pay about $1000 cancellation fee to get a refund. We were very frustrated with the overall experience and no accommodations nor accountability as they wanted to charge us for any service or changes possible. We were on the phone for 1.5 hours while other flights that we could have booked were getting sold out - the customer service person would take 15-20 minutes (for real) to search up each question or option we asked about. It has been a frustrating experience - we lost time and money and it ruined our holidays completely. We then demanded to speak to the manager, who again did not want to take any accountability for their team and gave us a small discount on the cancellation fee. In total, we were on the phone with them for 2 days back and forth, without any resolutions. We felt misguided and misinformed with their unfair policies towards their customers. We did not feel taken care of at all. We had to pay the price for the misinformation and the flights we did not cancel ourselves. After all this trouble, we spent Christmas without our family and the missed the urgent reasons we were travelling for. 
In [38]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Tokenize the words in the "review" column and convert to lowercase
all_words = [word.lower() for review in data["reviews"] for word in word_tokenize(review)]

# Define a set of negative words to search for
negative_words = {"bad", "not good", "poor", "terrible", "disappointing", "disappointed", "awful", "horrible", "miserable", "stupid", "sad","nothappy"}

# Count the frequency of each negative word in the "review" column
negative_word_counts = Counter([word for word in all_words if word in negative_words])

# Create a list of negative words and their frequencies, sorted in descending order by frequency
negative_word_list = sorted(negative_word_counts.items(), key=lambda x: x[1], reverse=True)

# Print the list of negative words and their frequencies
for word, count in negative_word_list:
    print(f"{word}: {count}")
poor: 118
bad: 94
terrible: 53
disappointed: 40
awful: 38
disappointing: 32
horrible: 20
sad: 10
miserable: 10
stupid: 2
In [39]:
from nltk.tokenize import word_tokenize
from collections import Counter

# Define a set of negative words to search for
negative_words = {"bad", "not good", "poor", "terrible", "disappointing", "disappointed", "awful", "horrible", "miserable", "stupid", "sad","nothappy"}

# Define a function to count the negative words in a review
def count_negative_words(review):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(review)]
    # Count the frequency of each negative word in the review
    negative_word_counts = Counter([word for word in words if word in negative_words])
    # Return the total count of negative words in the review
    return sum(negative_word_counts.values())

# Create a new column in the data dataframe with the count of negative words for each review
data["negative_word_count"] = data["reviews"].apply(count_negative_words)

# Group the reviews by the "trip verification" column and calculate the mean negative word count for each group
grouped_data = data.groupby("trip verification")["negative_word_count"].mean()

# Print the mean negative word count for each group
print(grouped_data)
trip verification
Not Verified |     0.378698
✅ Trip Verified    0.425814
❎ Not Verified     0.000000
Name: negative_word_count, dtype: float64
In [40]:
from nltk.tokenize import word_tokenize
from collections import Counter

# Define a set of negative words to search for
negative_words = {"bad", "not good", "poor", "terrible", "disappointing", "disappointed", "awful", "horrible", "miserable", "stupid", "sad", "nothappy"}

# Define a function to count the negative words in a review
def count_negative_words(review):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(review)]
    # Count the frequency of each negative word in the review
    negative_word_counts = Counter([word for word in words if word in negative_words])
    # Return the total count of negative words in the review
    return sum(negative_word_counts.values())

# Create a new column in the data dataframe with the count of negative words for each review
data["negative_word_count"] = data["reviews"].apply(count_negative_words)

# Group the reviews by the "trip verification" column and calculate the total negative word count for each group
grouped_data = data.groupby("trip verification")["negative_word_count"].sum()

# Print the total negative word count for each group
print(grouped_data)
trip verification
Not Verified |      64
✅ Trip Verified    353
❎ Not Verified       0
Name: negative_word_count, dtype: int64
In [41]:
# Define a set of negative words to search for
negative_words = {"bad", "not good", "poor", "terrible", "disappointing", "disappointed", "awful", "horrible", "miserable", "stupid", "sad","nothappy"}

# Define a function to count the negative words in a review
def count_negative_words(review):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(review)]
    # Count the frequency of each negative word in the review
    negative_word_counts = Counter([word for word in words if word in negative_words])
    # Return the total count of negative words in the review
    return sum(negative_word_counts.values())

# Create a new column in the data dataframe with the count of negative words for each review
data["negative_word_count"] = data["reviews"].apply(count_negative_words)

# Group the reviews by the "trip verification" column and calculate the count of negative word for each group
grouped_data = data.groupby("trip verification")["negative_word_count"].sum()

# Convert the result to a DataFrame and plot a bar chart
ax = grouped_data.to_frame().plot(kind="bar", legend=False, color="blue")

# Set the chart title and axis labels
ax.set_title("Total Negative Word Counts by Trip Verification")
ax.set_xlabel("Trip Verification")
ax.set_ylabel("Total Negative Word Count")

# Show the chart
plt.show()
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:240: RuntimeWarning: Glyph 9989 missing from current font.
  font.set_text(s, 0.0, flags=flags)
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:240: RuntimeWarning: Glyph 10062 missing from current font.
  font.set_text(s, 0.0, flags=flags)
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:203: RuntimeWarning: Glyph 9989 missing from current font.
  font.set_text(s, 0, flags=flags)
/Users/ashleyvizcaino/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backends/backend_agg.py:203: RuntimeWarning: Glyph 10062 missing from current font.
  font.set_text(s, 0, flags=flags)
In [42]:
# Create a pie chart with the percentage of negative reviews for each trip verification status
plt.pie(grouped_data.values, labels=grouped_data.index, autopct='%1.1f%%')

# Add title
plt.title('Percentage of Negative Reviews by Trip Verification Status')

# Show the chart
plt.show()
In [43]:
positive_words = ["good", "great", "excellent"]
positive_reviews = data.loc[(data["trip verification"] != "trip verified") & (data["reviews"].str.contains("|".join(positive_words))), "reviews"]
for review in positive_reviews:
    print(reviews)
    break
0       | Excellent service both on the ground and on...
1       |  Good lounge at Cape Town. On time departur...
2       |  A really excellent journey. Lounge not ove...
3       |  This flight was one of the worst I have ev...
4       | It seems that there is a race to the bottom...
                             ...                        
995     | Flew British Airways from London Heathrow t...
996     |  Madrid to London. The main plus about this...
997     |  London to Moscow. British Airways has down...
998     |  Miami to London. My most recent BA experie...
999     |  Gatwick to Barbados in December 2017. On a...
Name: reviews, Length: 1000, dtype: object
In [44]:
# Define a set of positive words to search for
positive_words = {"good", "excellent", "awesome", "fantastic", "great", "amazing", "superb", "wonderful", "happy", "satisfied"}

# Define a function to count the positive words in a review
def count_positive_words(reviews):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(reviews)]
    # Count the frequency of each positive word in the review
    positive_word_counts = Counter([word for word in words if word in positive_words])
    # Return the total count of positive words in the review
    return sum(positive_word_counts.values())

# Create a new column in the data dataframe with the count of positive words for each review
data["positive_word_count"] = data["reviews"].apply(count_positive_words)

# Group the reviews by the "trip verification" column and calculate the mean positive word count for each group
grouped_data = data.groupby("trip verification")["positive_word_count"].mean()

# Print the mean positive word count for each group
print(grouped_data)
trip verification
Not Verified |     0.792899
✅ Trip Verified    0.884198
❎ Not Verified     0.000000
Name: positive_word_count, dtype: float64
In [45]:
# Define a set of positive words to search for
positive_words = {"good", "great", "excellent", "awesome", "fantastic", "amazing", "love", "like", "enjoy", "happy", "satisfied"}

# Define a function to count the positive words in a review
def count_positive_words(reviews):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(reviews)]
    # Count the frequency of each positive word in the review
    positive_word_counts = Counter([word for word in words if word in positive_words])
    # Return the total count of positive words in the review
    return sum(positive_word_counts.values())

# Create a new column in the data dataframe with the count of positive words for each review
data["positive_word_count"] = data["reviews"].apply(count_positive_words)

# Group the reviews by the "trip verification" column and calculate the count of positive words for each group
grouped_data = data.groupby("trip verification")["positive_word_count"].sum()

# Print the count of positive words for each group
print(grouped_data)
trip verification
Not Verified |     161
✅ Trip Verified    873
❎ Not Verified       0
Name: positive_word_count, dtype: int64
In [46]:
import matplotlib.pyplot as plt

# Define a set of positive words to search for
positive_words = {"good", "great", "excellent", "fantastic", "awesome", "amazing", "wonderful", "happy"}

# Define a function to count the positive words in a review
def count_positive_words(review):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(review)]
    # Count the frequency of each positive word in the review
    positive_word_counts = Counter([word for word in words if word in positive_words])
    # Return the total count of positive words in the review
    return sum(positive_word_counts.values())

# Create a new column in the data dataframe with the count of positive words for each review
data["positive_word_count"] = data["reviews"].apply(count_positive_words)

# Group the reviews by the "trip verification" column and calculate the count of positive word occurrences for each group
grouped_data = data.groupby("trip verification")["positive_word_count"].sum()

# Plot the bar chart
fig, ax = plt.subplots()
grouped_data.plot(kind="bar", ax=ax)

# Set the chart title and axis labels
ax.set_title("Count of Positive Reviews by Trip Verification")
ax.set_xlabel("Trip Verification")
ax.set_ylabel("Count")

# Show the chart
plt.show()
In [47]:
import matplotlib.pyplot as plt

# Define a set of positive words to search for
positive_words = {"good", "great", "excellent", "amazing", "wonderful", "fantastic", "terrific", "awesome", "satisfying", "pleasing", "enjoyable"}

# Define a function to count the positive words in a review
def count_positive_words(reviews):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(reviews)]
    # Count the frequency of each positive word in the review
    positive_word_counts = Counter([word for word in words if word in positive_words])
    # Return the total count of positive words in the review
    return sum(positive_word_counts.values())

# Create a new column in the data dataframe with the count of positive words for each review
data["positive_word_count"] = data["reviews"].apply(count_positive_words)

# Group the reviews by the "trip verification" column and calculate the total positive word count for each group
grouped_data = data.groupby("trip verification")["positive_word_count"].sum()

# Plot a pie chart of the positive review counts
plt.pie(grouped_data, labels=grouped_data.index, autopct='%1.1f%%')
plt.title("Positive Review Counts by Trip Verification")
plt.show()
In [48]:
# Define a set of positive words to search for
positive_words = {"good", "great", "excellent", "amazing", "wonderful", "fantastic", "terrific", "awesome", "satisfying", "pleasing", "enjoyable"}

# Define a function to count the positive words in a review
def count_positive_words(review):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(review)]
    # Count the frequency of each positive word in the review
    positive_word_counts = Counter([word for word in words if word in positive_words])
    # Return the total count of positive words in the review
    return sum(positive_word_counts.values())

# Create a new column in the data dataframe with the count of positive words for each review
data["positive_word_count"] = data["reviews"].apply(count_positive_words)

# Find the countries with the most positive reviews
positive_reviews = data[data["positive_word_count"] > 0]
positive_reviews_by_country = positive_reviews.groupby("countries")["positive_word_count"].count()
best_countries = positive_reviews_by_country.sort_values(ascending=False)

# Print the best countries
print(best_countries.head(10))
countries
Singapore       10
South Africa     6
Qatar            4
Australia        4
Barbados         4
France           4
Egypt            3
Mexico           3
Japan            3
Canada           3
Name: positive_word_count, dtype: int64
In [49]:
import plotly.express as px

# Replace "best_countries" with your own variable name for the grouped data
fig = px.choropleth(best_countries, locations=best_countries.index, locationmode="country names", color="positive_word_count", 
                    hover_name=best_countries.index, projection="natural earth", title="Countries with the most positive reviews")
fig.show()
In [50]:
# Define a set of negative words to search for
negative_words = {"bad", "not good", "poor", "terrible", "disappointing", "disappointed", "awful", "horrible", "miserable", "stupid", "sad", "nothappy"}

# Define a function to count the negative words in a review
def count_negative_words(review):
    # Tokenize the words in the review and convert to lowercase
    words = [word.lower() for word in word_tokenize(review)]
    # Count the frequency of each negative word in the review
    negative_word_counts = Counter([word for word in words if word in negative_words])
    # Return the total count of negative words in the review
    return sum(negative_word_counts.values())

# Create a new column in the data dataframe with the count of negative words for each review
data["negative_word_count"] = data["reviews"].apply(count_negative_words)

# Find the countries with the most negative reviews
negative_reviews = data[data["negative_word_count"] > 0]
negative_reviews_by_country = negative_reviews.groupby("countries")["negative_word_count"].count()
worst_countries = negative_reviews_by_country.sort_values(ascending=False)

# Print the worst countries
print(worst_countries)
countries
Singapore       11
France           5
Japan            4
Barbados         4
Canada           4
South Africa     3
Jersey           3
Australia        2
Qatar            2
Egypt            2
Mexico           2
Mauritius        2
India            2
Spain            1
New Zealand      1
Malaysia         1
Iceland          1
Italy            1
Bahrain          1
Greece           1
Gibraltar        1
Ghana            1
Germany          1
Cyprus           1
Bulgaria         1
Bermuda          1
Belgium          1
Thailand         1
Name: negative_word_count, dtype: int64
In [51]:
import plotly.express as px

# Create a new dataframe with the count of negative reviews by country
negative_reviews_count = negative_reviews_by_country.reset_index(name='count')

# Create a choropleth map based on the negative reviews count by country
fig = px.choropleth(negative_reviews_count, locations="countries", locationmode='country names', color="count",
                    title="Negative Reviews by Country", color_continuous_scale="Reds")
fig.show()